In this script I integrate the mouse neuron data with publically available datasets and compare PCA and WGCNA for gene module discovery.
Load the required R libraries:
library(readxl)
library(dplyr)
library(Seurat)
library(myUtils)
library(EnsDb.Mmusculus.v75)
library(lubridate)
library(ComplexHeatmap)
library(R.matlab)
library(WGCNA)
library(biomaRt)
library(org.Mm.eg.db)
library(gridExtra)
library(grid)
Here I 1.) load the data 2.) change the gene identifiers from ensemble ids to symbols, where this is possible (otherwise I leave the ensembl id in place) 3.) load the metadata 4.) I put the data and metadata into a Seurat object. 5.) I also load mitochondrial mouse genes from a database for later use.
directory = '/home/jovyan/axonGrowth/'
setwd(directory)
axonGrowth_counts = read.csv('counts.csv', row.names = 1)
rownames(axonGrowth_counts) = unlist(lapply(1:length(rownames(axonGrowth_counts)), function(x) strsplit(rownames(axonGrowth_counts)[x], split = '\\.')[[1]][1]))
symbols = mapIdsMouse(rownames(axonGrowth_counts), 'ENSEMBL', 'SYMBOL')
mitogenes <- genes(EnsDb.Mmusculus.v75, filter = ~ seq_name == "MT")$gene_id
percent.mt = colSums(axonGrowth_counts[rownames(axonGrowth_counts) %in% mitogenes,])/colSums(axonGrowth_counts)
rownames(axonGrowth_counts) = symbols
#sum(unlist(lapply(1:dim(axonGrowth_counts)[1], function(x) substring(rownames(axonGrowth_counts)[x], 1,3))) != 'ENS')/dim(axonGrowth_counts)[1]
metadata = read_xlsx('Summary-neurons_microcoverslips_RNAseq.xlsx', col_types = c('text', 'text', 'date', 'date', 'date', 'date', 'date', rep('text', 7)), skip = 1)
metadata = metadata[10:105,]
axonGrowth = CreateSeuratObject(axonGrowth_counts, project = 'axonGrowth', min.cells = 0, min.features = 0)
axonGrowth$TotalNeuriteLength = as.numeric(metadata$`Total Neurite Length (um)`)
axonGrowth$LongestNeuriteLength = as.numeric(metadata$`Longest Neurite Length (um)`)
axonGrowth$TimeSincePlating = unlist(lapply(1:dim(metadata)[1], function(x) interval(ymd_hms(metadata$`Time of plating (mm/dd/yyyy hh:mm)`[x]), ymd_hms(metadata$`Cell collection time (mm/dd/yyyy hh:mm)`[x]))/dhours(1)))
axonGrowth$TimeSincePlating = unlist(lapply(1:dim(metadata)[1], function(x) interval(ymd_hms(metadata$`Time of plating (mm/dd/yyyy hh:mm)`[x]), ymd_hms(metadata$`Cell collection time (mm/dd/yyyy hh:mm)`[x]))/dhours(1)))
axonGrowth$TimeInAIRMEM = unlist(lapply(1:dim(metadata)[1], function(x) interval(ymd_hms(metadata$`AIR-MEM change (mm/dd/yyyy hh:mm)`[x]), ymd_hms(metadata$`Cell collection time (mm/dd/yyyy hh:mm)`[x]))/dhours(1)))
The QC plots using number of detected genes, number of counts and percent of counts coming from mitochondrial genes (as a proxy for stress), show a couple of outlier cells, which I remove in the last line:
axonGrowth[["percent.mt"]] <- percent.mt
VlnPlot(axonGrowth, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(axonGrowth, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(axonGrowth, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
axonGrowth <- subset(axonGrowth, subset = nFeature_RNA > 10000 & nFeature_RNA < 20000 & percent.mt < 2.5 & nCount_RNA > 10^5 & nCount_RNA < 10^6)
VlnPlot(axonGrowth, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
The following normalizes, scales and select 1000 particularly variable genes. Doing the downstream analysis with only the 1000 most variable genes seems like a waste of information. However, given we only have 85 high quality cells available for this analysis at the moment, it will give us the most robust clustering results.
axonGrowth <- NormalizeData(axonGrowth, normalization.method = "LogNormalize", scale.factor = 10000)
axonGrowth <- FindVariableFeatures(axonGrowth, selection.method = "vst", nfeatures = 1000)
top25 <- head(VariableFeatures(axonGrowth), 25)
plot1 <- VariableFeaturePlot(axonGrowth)
plot2 <- LabelPoints(plot = plot1, points = top25, repel = TRUE)
CombinePlots(plots = list(plot1, plot2))
all.genes <- rownames(axonGrowth)
axonGrowth <- ScaleData(axonGrowth, features = all.genes)
These are the results of the PCA analysis:
axonGrowth <- RunPCA(axonGrowth, features = VariableFeatures(object = axonGrowth))
print(axonGrowth[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: ENSMUSG00000105285, ENSMUSG00000111824, ENSMUSG00000092805, Tmem170, ENSMUSG00000106451
## Negative: Ppia, Rtn3, Hnrnpk, Calm1, ENSMUSG00000044285
## PC_ 2
## Positive: Cyba, Ctss, Lyz2, Tyrobp, Hexb
## Negative: Stmn4, Mapt, Nrep, Map1b, Stmn2
## PC_ 3
## Positive: Cldn3, Rpl17, Smurf2, Prxl2a, Zfp334
## Negative: Syt3, Lao1, Tspan12, Ado, Pcdhb16
## PC_ 4
## Positive: Zscan18, Rtf2, Kctd2, Ddit3, Rps6kc1
## Negative: Zdhhc8, Acap3, Stard7, Pgm1, Aatk
## PC_ 5
## Positive: ENSMUSG00000096972, Slc19a1, Ndufv1, Atg101, Irf3
## Negative: Gadd45g, Slc38a2, Plppr1, Gli3, Cdpf1
VizDimLoadings(axonGrowth, dims = 1:2, reduction = "pca")
DimPlot(axonGrowth, reduction = "pca")
DimHeatmap(axonGrowth, dims = 1, cells = round(dim(axonGrowth)[2])/2, balanced = TRUE)
DimHeatmap(axonGrowth, dims = 1:15, cells = round(dim(axonGrowth)[2])/2, balanced = TRUE)
Based on the JackStraw procedure I select 12 PCs for further analysis:
axonGrowth <- JackStraw(axonGrowth, num.replicate = 100)
axonGrowth <- ScoreJackStraw(axonGrowth, dims = 1:20)
JackStrawPlot(axonGrowth, dims = 1:20)
ElbowPlot(axonGrowth, ndims = 40)
n_components = 12
Let’s load the data from https://science.sciencemag.org/content/360/6385/176 as a reference:
reference = readMat('/home/jovyan/data/developingMouse/GSM3017261_150000_CNS_nuclei.mat')
We focus on hippocampal neurons from the mouse brain:
region = unlist(lapply(reference$cluster.assignment, function(x) strsplit(x, split = ' ')[[1]][2]))
subset = which(region == 'HIPP')
reference$DGE = reference$DGE[subset,]
reference$barcodes = reference$barcodes[subset]
reference$cluster.assignment = reference$cluster.assignment[subset]
reference$sample.type = reference$sample.type[subset]
reference$genes = trimws(reference$genes)
subset_genes = which(reference$genes %in% rownames(axonGrowth))
reference$genes = reference$genes[subset_genes]
reference$DGE = reference$DGE[,subset_genes]
reference_matrix = as.matrix(reference$DGE)
reference_matrix = t(reference_matrix)
rownames(reference_matrix) = reference$genes
colnames(reference_matrix) = reference$barcodes
reference$DGE = NULL
Reference = CreateSeuratObject(reference_matrix)
Reference <- NormalizeData(Reference, verbose = FALSE)
Reference <- FindVariableFeatures(Reference, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
Now we integrate the two datasets using the method provided by Seurat (see https://www.sciencedirect.com/science/article/pii/S0092867419305598?via%3Dihub):
axonGrowth.anchors <- FindIntegrationAnchors(object.list = list(Reference, axonGrowth), dims = 1:30, verbose = TRUE, k.filter = 77)
axonGrowth.integrated <- IntegrateData(anchorset = axonGrowth.anchors, dims = 1:30)
Let’s vizualize differentiation trajectories with the markers suggested in the reference paper and see how our data relates to the reference data on a UMAP plot:
# Run the standard workflow for visualization and clustering
axonGrowth.integrated <- ScaleData(axonGrowth.integrated, verbose = FALSE)
axonGrowth.integrated <- RunPCA(axonGrowth.integrated, npcs = 30, verbose = FALSE)
axonGrowth.integrated <- RunUMAP(axonGrowth.integrated, reduction = "pca", dims = 1:5)
axonGrowth.integrated$orig.ident[axonGrowth.integrated$orig.ident == 'SeuratProject'] = 'Reference'
axonGrowth.integrated$orig.ident[axonGrowth.integrated$orig.ident == 'axonGrowth'] = 'Our data'
DimPlot(axonGrowth.integrated, reduction = "umap", group.by = "orig.ident")
FeaturePlot(axonGrowth.integrated, features = c('Mki67', 'Prox1', 'Dock10', 'Meis2', 'Spock1'))
FeaturePlot(axonGrowth.integrated, features = c('Mki67', 'Prox1', 'Dock10', 'Meis2', 'Spock1'), cells = colnames(axonGrowth.integrated)[axonGrowth.integrated$orig.ident == 'Our data'])
Now we find correlated gene modules with WGCNA, see https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html and see to what extent their eigengene correlates with total neurite length and longest neurite length:
options(stringsAsFactors = FALSE);
enableWGCNAThreads()
## Allowing parallel execution with up to 23 working processes.
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(t(axonGrowth.integrated@assays$integrated@scale.data), powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 2000.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 2000 of 2000
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.878 -3.40 0.968 3.49e+01 3.03e+01 1.27e+02
## 2 2 0.937 -2.42 0.969 1.92e+00 1.06e+00 2.32e+01
## 3 3 0.896 -1.96 0.905 2.61e-01 6.64e-02 6.83e+00
## 4 4 0.382 -2.41 0.349 6.02e-02 5.72e-03 2.50e+00
## 5 5 0.339 -2.05 0.223 1.89e-02 6.17e-04 1.04e+00
## 6 6 0.319 -1.86 0.140 7.24e-03 7.39e-05 5.32e-01
## 7 7 0.346 -2.70 0.247 3.20e-03 9.76e-06 3.47e-01
## 8 8 0.353 -2.54 0.265 1.58e-03 1.37e-06 2.33e-01
## 9 9 0.357 -2.35 0.275 8.43e-04 1.95e-07 1.59e-01
## 10 10 0.364 -2.28 0.283 4.80e-04 2.76e-08 1.10e-01
## 11 12 0.359 -2.11 0.281 1.78e-04 6.31e-10 5.39e-02
## 12 14 0.335 -2.15 0.149 7.38e-05 1.41e-11 2.70e-02
## 13 16 0.325 -1.96 0.142 3.28e-05 3.40e-13 1.37e-02
## 14 18 0.330 -1.91 0.147 1.52e-05 8.77e-15 6.98e-03
## 15 20 0.335 -1.97 0.149 7.24e-06 2.22e-16 3.59e-03
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
softPower = 5;
adjacency = adjacency(t(axonGrowth.integrated@assays$integrated@scale.data), power = softPower)
TOM = TOMsimilarity(adjacency);
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
## ..cutHeight not given, setting it to 1 ===> 99% of the (truncated) height range in dendro.
## ..done.
table(dynamicMods)
## dynamicMods
## 0 1 2 3
## 1596 265 80 59
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
## blue brown grey turquoise
## 80 59 1596 265
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
datTraits = data.frame(TotalNeuriteLength = axonGrowth.integrated$TotalNeuriteLength,
LongestNeuriteLength = axonGrowth.integrated$LongestNeuriteLength)
MEs0 = moduleEigengenes(t(axonGrowth.integrated@assays$integrated@scale.data), dynamicColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
nSamples = dim(axonGrowth.integrated@assays$integrated@scale.data)[2]
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
Finally let’s see how the gene modules relate to the UMAP plot:
axonGrowth.integrated$MEturquoise = MEs$MEturquoise
axonGrowth.integrated$MEblue = MEs$MEblue
axonGrowth.integrated$MEbrown = MEs$MEbrown
axonGrowth.integrated$MEgrey = MEs$MEgrey
FeaturePlot(axonGrowth.integrated, features = c('MEturquoise', 'MEblue', 'MEbrown', 'MEgrey'))
And what go functions they are enriched for:
library(topGO)
for (module in c('turquoise', 'blue', 'brown', 'grey')){
print(module)
all_genes = rep(0,dim(axonGrowth.integrated@assays$integrated@scale.data)[1])
names(all_genes) = rownames(axonGrowth.integrated@assays$integrated@scale.data)
all_genes[dynamicColors == module] = 1
GOdata <- new("topGOdata", ontology = "BP", allGenes = all_genes, geneSel = function(p) p <
0.05, description = "Test", annot = annFUN.org, mapping = "org.Mm.eg.db",
ID = "symbol")
resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
tab = GenTable(GOdata, classicFisher = resultFisher, topNodes = 10)
d = as.data.frame(as.matrix(tab[,c(1,2,6)]))
p = grid.table(d)
print(p)
print(d)
}
## [1] "turquoise"
## NULL
## GO.ID Term classicFisher
## 1 GO:0007049 cell cycle 5.3e-05
## 2 GO:0009056 catabolic process 0.00019
## 3 GO:0010564 regulation of cell cycle process 0.00026
## 4 GO:0022402 cell cycle process 0.00043
## 5 GO:0002376 immune system process 0.00046
## 6 GO:0008152 metabolic process 0.00051
## 7 GO:0071704 organic substance metabolic process 0.00055
## 8 GO:0044238 primary metabolic process 0.00058
## 9 GO:0044237 cellular metabolic process 0.00066
## 10 GO:0006807 nitrogen compound metabolic process 0.00068
## [1] "blue"
## NULL
## GO.ID Term classicFisher
## 1 GO:0007017 microtubule-based process 0.018
## 2 GO:1901615 organic hydroxy compound metabolic proce... 0.037
## 3 GO:0000226 microtubule cytoskeleton organization 0.052
## 4 GO:0016358 dendrite development 0.052
## 5 GO:0001701 in utero embryonic development 0.057
## 6 GO:0032990 cell part morphogenesis 0.060
## 7 GO:0051301 cell division 0.060
## 8 GO:0048858 cell projection morphogenesis 0.061
## 9 GO:0120039 plasma membrane bounded cell projection ... 0.061
## 10 GO:0030182 neuron differentiation 0.062
## [1] "brown"
## NULL
## GO.ID Term classicFisher
## 1 GO:0009059 macromolecule biosynthetic process 0.0021
## 2 GO:1901360 organic cyclic compound metabolic proces... 0.0028
## 3 GO:0032989 cellular component morphogenesis 0.0033
## 4 GO:0034645 cellular macromolecule biosynthetic proc... 0.0034
## 5 GO:0000902 cell morphogenesis 0.0050
## 6 GO:0090304 nucleic acid metabolic process 0.0054
## 7 GO:0006725 cellular aromatic compound metabolic pro... 0.0055
## 8 GO:0016070 RNA metabolic process 0.0057
## 9 GO:0046483 heterocycle metabolic process 0.0065
## 10 GO:0099536 synaptic signaling 0.0065
## [1] "grey"
## NULL
## GO.ID Term classicFisher
## 1 GO:0009605 response to external stimulus 1.1e-10
## 2 GO:0002274 myeloid leukocyte activation 7.3e-08
## 3 GO:0030593 neutrophil chemotaxis 7.5e-08
## 4 GO:0071621 granulocyte chemotaxis 2.3e-07
## 5 GO:0098742 cell-cell adhesion via plasma-membrane a... 2.3e-07
## 6 GO:0032879 regulation of localization 4.5e-07
## 7 GO:0007155 cell adhesion 5.2e-07
## 8 GO:0051049 regulation of transport 6.0e-07
## 9 GO:0050808 synapse organization 6.0e-07
## 10 GO:0048002 antigen processing and presentation of p... 6.2e-07
The ‘blue module’ is moderately correlated with Neurite Length and enriched for known GO processes involved in neurite growth. The genes contained in this module are printed out below:
print(names(all_genes)[dynamicColors == 'blue'])
## [1] "Csf1r" "Spp1" "Anxa3" "Trem2" "Apobec1" "Plin2"
## [7] "Hvcn1" "Itgam" "Gpx1" "Sh3bgrl3" "Lgals3" "Fth1"
## [13] "Lyz2" "Ftl1" "Ctsz" "Ctss" "Cd68" "Gpnmb"
## [19] "Capg" "Laptm5" "B2m" "Tyrobp" "Mmp12" "Fcer1g"
## [25] "H2-D1" "Arpc1b" "Cybb" "Anxa1" "Mfge8" "Mpeg1"
## [31] "Fcgr3" "Cyba" "C3ar1" "Trf" "H2-K1" "Anxa2"
## [37] "Ccl9" "Cd53" "Sat1" "C5ar1" "Pfn1" "Itgb2"
## [43] "Gstm1" "Cd36" "Atp6v0d2" "Unc93b1" "Cd52" "Tnfrsf12a"
## [49] "Cav2" "Plaur" "Aldoc" "Cd93" "Ninj1" "Cdkn1a"
## [55] "Pirb" "S100a1" "Rac2" "Cd72" "Fbxo43" "Hoxa3"
## [61] "Tmem217" "Tspo" "Ankrd37" "Swsap1" "Tmem119" "Bank1"
## [67] "Naip6" "Ado" "Oas1a" "Mgst1" "Nfe2l2" "Dnase1l1"
## [73] "Snrnp25" "Mvp" "Gsn" "Kcnn4" "Uap1l1" "Tlr2"
## [79] "Tlr13" "Cldn3"
However, the below plot that shows Longest Neurite Length as a function of this modules eigengene (PC1), shows no clear association between the two:
plot(axonGrowth.integrated$MEblue, axonGrowth.integrated$TotalNeuriteLength, pch = '.', cex = 10, xlab = 'Blue Module PCA1-score', ylab = 'LongestNeuriteLength')